#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(WGCNA)
## ==========================================================================
## *
## *  Package WGCNA 1.66 loaded.
## *
## *    Important note: It appears that your system supports multi-threading,
## *    but it is not enabled within WGCNA in R. 
## *    To allow multi-threading within WGCNA with all available cores, use 
## *
## *          allowWGCNAThreads()
## *
## *    within R. Use disableWGCNAThreads() to disable threading if necessary.
## *    Alternatively, set the following environment variable on your system:
## *
## *          ALLOW_WGCNA_THREADS=<number_of_processors>
## *
## *    for example 
## *
## *          ALLOW_WGCNA_THREADS=4
## *
## *    To set the environment variable in linux bash shell, type 
## *
## *           export ALLOW_WGCNA_THREADS=4
## *
## *     before running R. Other operating systems or shells will
## *     have a similar command to achieve the same aim.
## *
## ==========================================================================
library(expss)
library(polycor)
library(knitr)

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

clustering_selected = 'DynamicHybrid'

print(paste0('Using clustering ', clustering_selected))
## [1] "Using clustering DynamicHybrid"

Load preprocessed dataset (preprocessing code in 19_10_14_data_preprocessing.Rmd) and clustering (pipeline in 19_10_15_WGCNA.Rmd)

# Gandal dataset
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame


# GO Neuronal annotations
GO_annotations = read.csv('./../../../PhD-InitialExperiments/FirstYearReview/Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>% 
  mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
  left_join(GO_neuronal, by='ID') %>% 
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`), 
         significant=padj<0.05 & !is.na(padj))


# Dataset created with DynamicTreeMerged algorithm
dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]


# GO DEA
# load('./../Data/Gandal/GO_DE_clusters.RData')

rm(DE_info, GO_annotations, clusterings)
print(paste0('There are ', length(unique(dataset$Module)),' modules:'))
## [1] "There are 62 modules:"
dataset$Module %>% table %>% sort(decreasing=TRUE) %>% print
## .
## #5CB300 #D675FD #00BF78 #F663E2 #FE61D0 #F07F4A #00C0B5 #E06FF8 #FF61C5 
##    1298    1277    1158    1149    1086     639     614     566     441 
## #00BE69 #FF64AF #00BC59 #00BBDC #00C19E #E58702 #6B9AFF #AFA200 #8DAB00 
##     395     366     328     327     312     295     291     277     259 
## #F47A5D #48A0FF #00B2F3 #FE6E8A #FF6A97 #00AEF9 #00AAFE #00C085 #13B700 
##     259     254     247     239     217     215     205     197     182 
##    gray #00B930 #C29B00 #CA7BFF #FA62D9 #00A5FF #E96AF1 #F066EA #F8766D 
##     178     176     175     168     166     160     158     158     131 
## #A5A500 #43B500 #DF8B00 #D19300 #FF67A3 #FF62BB #CA9700 #00C0BF #00C1AA 
##     128     125     123     120     115     114     106      99      97 
## #D88F00 #6FB000 #EB8332 #FB727C #8594FF #B99E00 #00C092 #00BEC9 #BD81FF 
##      97      95      94      84      82      72      62      57      52 
## #00B6EC #AD87FF #9B8EFF #00B9E4 #9AA800 #7FAE00 #00BB47 #00BDD3 
##      45      45      42      41      40      38      32      32

Module-Trait associations

datTraits = datMeta %>% dplyr::select(Diagnosis_, Region, Sex, Age, PMI, RNAExtractionBatch) %>%
            rename('Diagnosis' = Diagnosis_, 'ExtractionBatch' = RNAExtractionBatch)

# Recalculate MEs with color labels
ME_object = datExpr %>% t %>% moduleEigengenes(colors = dataset$Module)
MEs = orderMEs(ME_object$eigengenes)

# Calculate correlation between eigengenes and the traits and their p-values
moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits, ML=TRUE)$correlations[1,-1]) %>% t
rownames(moduleTraitCor) = colnames(MEs)
colnames(moduleTraitCor) = colnames(datTraits)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))

# In case there are any NAs
if(sum(!complete.cases(moduleTraitCor))>0){
  print(paste0(sum(is.na(moduleTraitCor)),' correlation(s) could not be calculated')) 
}
## [1] "1 correlation(s) could not be calculated"
moduleTraitCor = moduleTraitCor[complete.cases(moduleTraitCor),]
moduleTraitPvalue = moduleTraitPvalue[complete.cases(moduleTraitCor),]

# Sort moduleTraitCor by Diagnosis
moduleTraitCor = moduleTraitCor[order(moduleTraitCor[,1], decreasing=TRUE),]
moduleTraitPvalue = moduleTraitPvalue[order(moduleTraitCor[,1], decreasing=TRUE),]

# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)


labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels =  gsub('ME','',rownames(moduleTraitCor)), 
               yColorWidth=0, colors = brewer.pal(11,'PiYG'), bg.lab.y = gsub('ME','',rownames(moduleTraitCor)),
               textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.8, cex.lab.y = 0.75, zlim = c(-1,1),
               main = paste('Module-Trait relationships'))

rm(ME_object, datTraits, MEs, moduleTraitCor, moduleTraitPvalue, textMatrix)

Selecting the three modules with highest (absolute) correlation to Diagnosis: #D675FD, #00BF78 and #13B700

pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
            mutate(ImportantModules = ifelse(Module=='#D675FD', '#D675FD',
                                      ifelse(Module=='#00BF78', '#00BF78',
                                      ifelse(Module=='#13B700', '#13B700', 'Others')))) %>%
            mutate(color = ifelse(ImportantModules=='Others','gray',ImportantModules),
                   alpha = ifelse(ImportantModules=='Others', 0.1, 0.5))

table(plot_data$ImportantModules)
## 
## #00BF78 #13B700 #D675FD  Others 
##    1158     182    1277   13983
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) + 
         geom_point(alpha=plot_data$alpha, color=plot_data$color) + theme_minimal() + 
           ggtitle('Modules with strongest relation to Diagnosis'))

Gene Significance

Comparing this with the DEA plot, Gene significance close to 0 means the gene expression is not affected by Diagnosis and negative values means the gene is underexpressed in Autism samples. So the important thing about gene significance is its absolute value

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID')

ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=GS)) + geom_point(alpha=0.5) + 
         scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) +
  theme_minimal() + ggtitle('Gene Significance'))

Module Membership and Gene Significance relation

Genes from the module #D675FD

Module with the highest (absolute) correlation to Diagnosis (-0.93)

plot_data = dataset %>% dplyr::select(MM.D675FD, GS, gene.score) %>% filter(dataset$Module=='#D675FD')

ggplotly(plot_data %>% ggplot(aes(MM.D675FD, GS, color=gene.score)) + geom_point(alpha=0.5) + 
         scale_color_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + 
         theme_minimal())

Genes from the module #00BF78

Module with the second highest (absolute) correlation to Diagnosis (0.91)

plot_data = dataset %>% dplyr::select(MM.00BF78, GS, gene.score) %>% filter(dataset$Module=='#00BF78')

ggplotly(plot_data %>% ggplot(aes(MM.00BF78, GS, color=gene.score)) + geom_point(alpha=0.5) + 
         scale_color_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + 
         theme_minimal())

Genes from the module #13B700

Module with the third highest (absolute) correlation to Diagnosis and the top one with positive correlation (0.91)

plot_data = dataset %>% dplyr::select(MM.13B700, GS, gene.score) %>% filter(dataset$Module=='#13B700')


ggplotly(plot_data %>% ggplot(aes(MM.13B700, GS, color=gene.score)) + geom_point(alpha=0.5) + 
         scale_color_manual(values=SFARI_colour_hue(r=c(2:6,8,7))) + 
         theme_minimal())

Identifying important genes

Selecting the modules with the highest correlation to Diagnosis, and, from them, the genes with the highest module membership-(absolute) gene significance

*Ordered by \(\frac{MM+GS}{2}\)

Only three SFARI Genes in the three lists …

top_genes_1 = dataset %>% dplyr::select(ID, MM.D675FD, GS, gene.score) %>% filter(dataset$Module=='#D675FD') %>%
              rename('MM' = 'MM.D675FD') %>% mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>%
              top_n(10)

kable(top_genes_1, caption='Top 10 genes for module #D675FD')
Top 10 genes for module #D675FD
ID MM GS gene.score importance
ENSG00000050748 0.9014380 -0.9399810 None 0.9207095
ENSG00000108395 0.9067664 -0.9250494 None 0.9159079
ENSG00000138078 0.8923376 -0.8966280 None 0.8944828
ENSG00000177432 0.8590945 -0.8915541 None 0.8753243
ENSG00000163577 0.8377986 -0.9116043 None 0.8747015
ENSG00000176490 0.8551447 -0.8936753 None 0.8744100
ENSG00000155097 0.8548700 -0.8885175 None 0.8716938
ENSG00000128881 0.8740293 -0.8654230 None 0.8697261
ENSG00000171132 0.8345108 -0.9028094 None 0.8686601
ENSG00000114573 0.8277575 -0.9068579 None 0.8673077
top_genes_2 = dataset %>% dplyr::select(ID, MM.00BF78, GS, gene.score) %>% filter(dataset$Module=='#00BF78') %>%
              rename('MM' = 'MM.00BF78') %>% mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>%
              top_n(10)

kable(top_genes_2, caption='Top 10 genes for module #00BF78')
Top 10 genes for module #00BF78
ID MM GS gene.score importance
ENSG00000180573 0.8413858 0.8648084 None 0.8530971
ENSG00000181722 0.8640272 0.8406765 3 0.8523519
ENSG00000003402 0.8807946 0.8010251 None 0.8409098
ENSG00000184678 0.7903735 0.8878029 None 0.8390882
ENSG00000120690 0.7921926 0.7836121 None 0.7879024
ENSG00000107104 0.8162030 0.7554489 4 0.7858260
ENSG00000084093 0.8091856 0.7472778 None 0.7782317
ENSG00000198879 0.8039142 0.7457312 None 0.7748227
ENSG00000198185 0.7958118 0.7482581 None 0.7720349
ENSG00000051620 0.7890904 0.7462823 None 0.7676863
top_genes_3 = dataset %>% dplyr::select(ID, MM.13B700, GS, gene.score) %>% filter(dataset$Module=='#13B700') %>%
              rename('MM' = 'MM.13B700') %>% mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>%
              top_n(10)

kable(top_genes_3, caption='Top 10 genes for module #13B700')
Top 10 genes for module #13B700
ID MM GS gene.score importance
ENSG00000089159 0.7939783 0.8730915 None 0.8335349
ENSG00000124782 0.8015047 0.8441104 None 0.8228075
ENSG00000138119 0.7761529 0.8385010 None 0.8073269
ENSG00000162745 0.7239416 0.8582103 None 0.7910759
ENSG00000168769 0.7488512 0.8144247 3 0.7816379
ENSG00000099875 0.7556764 0.7922251 None 0.7739507
ENSG00000145390 0.7691139 0.7401723 None 0.7546431
ENSG00000110719 0.6636778 0.8248276 None 0.7442527
ENSG00000178104 0.6935897 0.7847519 None 0.7391708
ENSG00000140836 0.7893068 0.6879968 None 0.7386518
plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
            mutate(color = ifelse(Module=='#D675FD', '#D675FD',
                           ifelse(Module=='#00BF78', '#00BF78',
                           ifelse(Module=='#13B700', '#13B700', 'gray')))) %>%
            mutate(alpha = ifelse(color %in% c('#D675FD', '#00BF78', '#13B700') & 
                                  ID %in% c(as.character(top_genes_1$ID), 
                                            as.character(top_genes_2$ID),
                                            as.character(top_genes_3$ID)), 1, 0.1))

plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) + 
              theme_minimal() + ggtitle('Important genes identified through WGCNA')


SFARI genes and Module-Diagnosis correlation

Modules with the strongest module-diagnosis correlation should have the highest percentage of SFARI Genes, but the plot shows this relation in modules with correlations close to 0, and the behaviour is the same for the other clusterings I have tried…

A good thing is that the gray cluster (which is actually all the genes that were left without a cluster) has a low correlation with ASD and also a low percentage of SFARI genes

plot_data = dataset %>% mutate('hasSFARIscore' = gene.score!='None') %>% 
            group_by(Module, MTcor, hasSFARIscore) %>% summarise(p=n()) %>% 
            left_join(dataset %>% group_by(Module) %>% summarise(n=n()), by='Module') %>% 
            mutate(p=round(p/n*100,2)) 

for(i in 1:nrow(plot_data)){
  this_row = plot_data[i,]
  if(this_row$hasSFARIscore==FALSE & this_row$p==100){
    new_row = this_row
    new_row$hasSFARIscore = TRUE
    new_row$p = 0
    plot_data = plot_data %>% rbind(new_row)
  }
}

plot_data = plot_data %>% filter(hasSFARIscore==TRUE)

ggplotly(plot_data %>% ggplot(aes(MTcor, p, size=n)) + geom_smooth(color='gray', se=FALSE) +
         geom_point(color=plot_data$Module, aes(id=Module)) +
         xlab('Module-Diagnosis correlation') + ylab('% of SFARI genes') +
         theme_minimal() + theme(legend.position = 'none'))
rm(i, this_row, new_row)